<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 5.4.0">
  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/images/logo.svg" color="#222">

<link rel="stylesheet" href="/css/main.css">


<link rel="stylesheet" href="/lib/font-awesome/css/all.min.css">

<script id="hexo-configurations">
    var NexT = window.NexT || {};
    var CONFIG = {"hostname":"example.com","root":"/","scheme":"Pisces","version":"7.8.0","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12,"onmobile":false},"copycode":{"enable":false,"show_result":false,"style":null},"back2top":{"enable":true,"sidebar":false,"scrollpercent":true},"bookmark":{"enable":false,"color":"#222","save":"auto"},"fancybox":false,"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"algolia":{"hits":{"per_page":10},"labels":{"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}},"localsearch":{"enable":false,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false},"motion":{"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}}};
  </script>

  <meta name="description" content="此篇为学习强化学习的个人笔记，对应书籍Reinforcement Learning An Introduction，第三章Finite Markov Decision Processes.">
<meta property="og:type" content="article">
<meta property="og:title" content="3-有限马尔可夫决策过程">
<meta property="og:url" content="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/index.html">
<meta property="og:site_name" content="祖浩の博客">
<meta property="og:description" content="此篇为学习强化学习的个人笔记，对应书籍Reinforcement Learning An Introduction，第三章Finite Markov Decision Processes.">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/image-20211023213702000.png">
<meta property="og:image" content="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/MDP1.png">
<meta property="og:image" content="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/image-20211024220454457.png">
<meta property="og:image" content="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/image-20211026102701417.png">
<meta property="og:image" content="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/image-20211026152532511.png">
<meta property="og:image" content="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/2021-10-27_19-53-37.png">
<meta property="og:image" content="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/2021-10-26-20-53-18.jpg">
<meta property="article:published_time" content="2021-10-25T08:09:17.000Z">
<meta property="article:modified_time" content="2022-02-28T02:03:49.347Z">
<meta property="article:author" content="谢祖浩">
<meta property="article:tag" content="强化学习">
<meta property="article:tag" content="MDP">
<meta property="article:tag" content="贝尔曼公式">
<meta property="article:tag" content="回溯图">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/image-20211023213702000.png">

<link rel="canonical" href="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/">


<script id="page-configurations">
  // https://hexo.io/docs/variables.html
  CONFIG.page = {
    sidebar: "",
    isHome : false,
    isPost : true,
    lang   : 'zh-CN'
  };
</script>

  <title>3-有限马尔可夫决策过程 | 祖浩の博客</title>
  






  <noscript>
  <style>
  .use-motion .brand,
  .use-motion .menu-item,
  .sidebar-inner,
  .use-motion .post-block,
  .use-motion .pagination,
  .use-motion .comments,
  .use-motion .post-header,
  .use-motion .post-body,
  .use-motion .collection-header { opacity: initial; }

  .use-motion .site-title,
  .use-motion .site-subtitle {
    opacity: initial;
    top: initial;
  }

  .use-motion .logo-line-before i { left: initial; }
  .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage">
  <div class="container use-motion">
    <div class="headband"></div>

    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="切换导航栏">
      <span class="toggle-line toggle-line-first"></span>
      <span class="toggle-line toggle-line-middle"></span>
      <span class="toggle-line toggle-line-last"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/" class="brand" rel="start">
      <span class="logo-line-before"><i></i></span>
      <h1 class="site-title">祖浩の博客</h1>
      <span class="logo-line-after"><i></i></span>
    </a>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
    </div>
  </div>
</div>




<nav class="site-nav">
  <ul id="menu" class="main-menu menu">
        <li class="menu-item menu-item-home">

    <a href="/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a>

  </li>
        <li class="menu-item menu-item-tags">

    <a href="/tags/" rel="section"><i class="fa fa-tags fa-fw"></i>标签</a>

  </li>
        <li class="menu-item menu-item-categories">

    <a href="/categories/" rel="section"><i class="fa fa-th fa-fw"></i>分类</a>

  </li>
        <li class="menu-item menu-item-archives">

    <a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档</a>

  </li>
  </ul>
</nav>




</div>
    </header>

    
  <div class="back-to-top">
    <i class="fa fa-arrow-up"></i>
    <span>0%</span>
  </div>


    <main class="main">
      <div class="main-inner">
        <div class="content-wrap">
          

          <div class="content post posts-expand">
            

    
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-block" lang="zh-CN">
    <link itemprop="mainEntityOfPage" href="http://example.com/2021/10/25/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/3-%E6%9C%89%E9%99%90%E9%A9%AC%E5%B0%94%E5%8F%AF%E5%A4%AB%E5%86%B3%E7%AD%96%E8%BF%87%E7%A8%8B/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/head.jpeg">
      <meta itemprop="name" content="谢祖浩">
      <meta itemprop="description" content="驽马十驾，功在不舍">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="祖浩の博客">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          3-有限马尔可夫决策过程
        </h1>

        <div class="post-meta">
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="far fa-calendar"></i>
              </span>
              <span class="post-meta-item-text">发表于</span>

              <time title="创建时间：2021-10-25 16:09:17" itemprop="dateCreated datePublished" datetime="2021-10-25T16:09:17+08:00">2021-10-25</time>
            </span>
              <span class="post-meta-item">
                <span class="post-meta-item-icon">
                  <i class="far fa-calendar-check"></i>
                </span>
                <span class="post-meta-item-text">更新于</span>
                <time title="修改时间：2022-02-28 10:03:49" itemprop="dateModified" datetime="2022-02-28T10:03:49+08:00">2022-02-28</time>
              </span>
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="far fa-folder"></i>
              </span>
              <span class="post-meta-item-text">分类于</span>
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/" itemprop="url" rel="index"><span itemprop="name">强化学习</span></a>
                </span>
            </span>

          
            <div class="post-description">These are my personal study notes for reinforcement learning, covering Chapter 3, Finite Markov Decision Processes, of the book Reinforcement Learning: An Introduction.</div>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">

      
        <h1 id="基本概念">Basic Concepts</h1>
<p><font face="仿宋"><strong>Agent</strong></font>: the learner and decision maker.</p>
<p><font face="仿宋"><strong>Environment</strong></font>: everything outside the agent that it interacts with.</p>
<p><img src="image-20211023213702000.png" alt="image-20211023213702000" style="zoom: 33%;" /></p>
<p>The agent and the environment interact at each of a sequence of discrete time steps <span class="math inline">\(t=0,1,2...\)</span>. At time <span class="math inline">\(t\)</span> the agent observes the environment state <span class="math inline">\(S_t\)</span> and selects an action <span class="math inline">\(A_t\)</span>. At the next time step it receives a numerical reward <span class="math inline">\(R_{t+1}\)</span> and finds itself in a new state <span class="math inline">\(S_{t+1}\)</span>.</p>
<p><font color=red><strong>In a finite MDP the sets <span class="math inline">\(S,A,R\)</span> of states, actions, and rewards are all finite. In this case the random variables <span class="math inline">\(R_{t}\)</span> and <span class="math inline">\(S_t\)</span> have well-defined discrete probability distributions that depend only on the preceding state <span class="math inline">\(S_{t-1}\)</span> and preceding action <span class="math inline">\(A_{t-1}\)</span>, not on any earlier states or actions.</strong></font></p>
<blockquote>
<p>This restriction is best viewed as a property of the state, not of the decision process.</p>
</blockquote>
<p>Given the preceding state <span class="math inline">\(s\)</span> and preceding action <span class="math inline">\(a\)</span>, the probability of the successor state <span class="math inline">\(s&#39;\)</span> and reward <span class="math inline">\(r\)</span> occurring is:</p>
<p><span class="math display">\[
p\left(s^{\prime}, r \mid s, a\right) \doteq \operatorname{Pr}\left\{S_{t}=s^{\prime}, R_{t}=r \mid S_{t-1}=s, A_{t-1}=a\right\}
\]</span></p>
<p>The function <span class="math inline">\(p\)</span> defines the dynamics of the MDP. The <span class="math inline">\(|\)</span> in the formula denotes conditioning, and by the laws of probability these probabilities sum to 1:</p>
<p><span class="math display">\[
\sum_{s^{\prime} \in \mathcal{S}} \sum_{r \in \mathcal{R}} p\left(s^{\prime}, r \mid s, a\right)=1, \text { for all } s \in \mathcal{S}, a \in \mathcal{A}(s)
\]</span></p>
<p>From the four-argument dynamics function <span class="math inline">\(p\)</span> one can compute anything else we might want to know about the environment, such as the <font color = red><strong>state-transition probabilities</strong></font>:</p>
<p><span class="math display">\[
p\left(s^{\prime} \mid s, a\right) \doteq \operatorname{Pr}\left\{S_{t}=s^{\prime} \mid S_{t-1}=s, A_{t-1}=a\right\}=\sum_{r \in \mathcal{R}} p\left(s^{\prime}, r \mid s, a\right)
\]</span></p>
<p>This is a marginal probability: with the preceding state <span class="math inline">\(s\)</span>, the action <span class="math inline">\(a\)</span>, and the successor state <span class="math inline">\(s&#39;\)</span> fixed, we sum the probabilities over all possible rewards.</p>
<p>There is also the <strong>expected reward</strong>, each reward weighted by its probability (tip: expectation = value <span class="math inline">\(\times\)</span> probability):</p>
<p><span class="math display">\[
\begin{equation}
r(s, a) \doteq \mathbb{E}\left[R_{t} \mid S_{t-1}=s, A_{t-1}=a\right]=\sum_{r \in \mathcal{R}} r \sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime}, r \mid s, a\right)\label{expected rewards}
\end{equation}
\]</span></p>
<p>In the expression below the successor state <span class="math inline">\(s&#39;\)</span> is also given, so compared with the previous formula we divide by the probability of that successor state.</p>
<p><span class="math display">\[
r\left(s, a, s^{\prime}\right) \doteq \mathbb{E}\left[R_{t} \mid S_{t-1}=s, A_{t-1}=a, S_{t}=s^{\prime}\right]=\sum_{r \in \mathcal{R}} r \frac{p\left(s^{\prime}, r \mid s, a\right)}{p\left(s^{\prime} \mid s, a\right)}
\]</span></p>
<p><strong>Example:</strong> consider the diagram below:</p>
<p><img src="MDP1.png" alt="MDP1" style="zoom: 50%;" /></p>
<p>From state <span class="math inline">\(S_a\)</span> there are two actions <span class="math inline">\(A_{b1},A_{b2}\)</span> leading to <span class="math inline">\(S_b\)</span>, with rewards <span class="math inline">\(R_{b1},R_{b2}\)</span>. There is one action <span class="math inline">\(A_d\)</span> leading to <span class="math inline">\(S_d\)</span>, with two possible rewards <span class="math inline">\(R_{1d},R_{2d}\)</span>.</p>
<p>By the formulas above, the following relations hold:</p>
<ul>
<li>The <strong>state-transition probability</strong> from state <span class="math inline">\(S_a\)</span> to <span class="math inline">\(S_b\)</span> (summing over the two actions)</li>
</ul>
<p><span class="math display">\[
p(S_b,R_{b1}|S_a,A_{b1}) + p(S_b,R_{b2}|S_a,A_{b2})
\]</span></p>
<ul>
<li>The <strong>state-transition probability</strong> from state <span class="math inline">\(S_a\)</span> to <span class="math inline">\(S_d\)</span></li>
</ul>
<p><span class="math display">\[
p(S_d,R_{1d}|S_a,A_d) + p(S_d,R_{2d}|S_a,A_d)
\]</span></p>
<ul>
<li>The probability of reaching <span class="math inline">\(S_d\)</span> when action <span class="math inline">\(A_d\)</span> is chosen in state <span class="math inline">\(S_a\)</span></li>
</ul>
<p><span class="math display">\[
p(S_d|S_a,A_d) = p(S_d,{\color{red} R_{1d}}|S_a,A_d) + p(S_d,{\color{red} R_{2d}}|S_a,A_d)
\]</span></p>
<ul>
<li>The expected reward after choosing action <span class="math inline">\(A_d\)</span> in state <span class="math inline">\(S_a\)</span></li>
</ul>
<p><span class="math display">\[
r(S_a,A_d) = R_{1d}·p(S_d,{\color{red} R_{1d}}|S_a,A_d) + R_{2d}·p(S_d,{\color{red} R_{2d}}|S_a,A_d)
\]</span></p>
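<p>The marginalization formulas above are easy to check numerically. The following is a minimal sketch, not from the book: all states, actions, rewards, and probabilities are made-up illustrative values. It stores the four-argument dynamics <span class="math inline">\(p(s&#39;,r|s,a)\)</span> as a Python dictionary and derives the state-transition probability, the expected reward, and the expected reward given the successor state from it.</p>
<pre class="python"><code># Minimal sketch: derive p(s'|s,a), r(s,a) and r(s,a,s') from a
# four-argument dynamics table.  All values below are made-up examples.

# p[(s, a)] maps (s_next, r) to a probability; each row sums to 1.
p = {
    ("Sa", "Ad"):  {("Sd", 1.0): 0.75, ("Sd", 5.0): 0.25},
    ("Sa", "Ab1"): {("Sb", 2.0): 1.0},
}

def transition_prob(s, a, s_next):
    """p(s'|s,a): marginalize the joint distribution over rewards."""
    return sum(prob for (sp, r), prob in p[(s, a)].items() if sp == s_next)

def expected_reward(s, a):
    """r(s,a): each reward weighted by its joint probability."""
    return sum(r * prob for (sp, r), prob in p[(s, a)].items())

def expected_reward_given_next(s, a, s_next):
    """r(s,a,s'): condition on s' by dividing by p(s'|s,a)."""
    joint = sum(r * prob for (sp, r), prob in p[(s, a)].items() if sp == s_next)
    return joint / transition_prob(s, a, s_next)

print(transition_prob("Sa", "Ad", "Sd"))             # 1.0
print(expected_reward("Sa", "Ad"))                   # 0.75*1.0 + 0.25*5.0 = 2.0
print(expected_reward_given_next("Sa", "Ad", "Sd"))  # 2.0
</code></pre>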
<h1 id="回报与分幕">Returns and Episodes</h1>
<p>The agent's goal is to maximize the reward it accumulates; formally, it seeks to maximize the <font face="仿宋"><strong>expected return</strong></font>. The return, written <span class="math inline">\(G_t\)</span>, is defined as some function of the reward sequence <span class="math inline">\(R_{t+1},R_{t+2}...\)</span>; in the simplest case it is the sum of the rewards:</p>
<p><span class="math display">\[
G_t = R_{t+1}+R_{t+2}+...+R_{T}
\]</span></p>
<p>Here <span class="math inline">\(T\)</span> is the final time step, so the task has a natural end, like a game of chess that terminates when one side wins. Such tasks are called <strong>episodic tasks</strong>. We write <span class="math inline">\(S^+\)</span> for the set of all states including terminal states, and <span class="math inline">\(S\)</span> for the set of nonterminal states.</p>
<p>In contrast, <strong>continuing tasks</strong> have no natural end: the reward sequence is infinite, so the return defined above cannot be used. We therefore introduce the <font face="仿宋"><strong>discount rate</strong></font> <span class="math inline">\(\gamma \in [0,1]\)</span>:</p>
<p><span class="math display">\[
G_{t} \doteq R_{t+1}+\gamma R_{t+2}+\gamma^{2} R_{t+3}+\cdots=\sum_{k=0}^{\infty} \gamma^{k} R_{t+k+1}
\]</span></p>
<ul>
<li>When the discount rate is 0, the agent is myopic: it cares only about the immediate reward.</li>
<li>As the discount rate approaches 1, the agent becomes increasingly farsighted.</li>
<li>As long as the rewards are bounded (for example a nonzero constant) and <span class="math inline">\(\gamma&lt;1\)</span>, the return is finite; for instance, if every reward is +1 the return is <span class="math inline">\(G_t=\sum^{\infty}_{k=0} \gamma^{k} = \frac{1}{1-\gamma}\)</span></li>
</ul>
<p>The return can also be written recursively:</p>
<p><span class="math display">\[
\begin{aligned}
G_{t} &amp; \doteq R_{t+1}+\gamma R_{t+2}+\gamma^{2} R_{t+3}+\gamma^{3} R_{t+4}+\cdots \\
&amp;=R_{t+1}+\gamma\left(R_{t+2}+\gamma R_{t+3}+\gamma^{2} R_{t+4}+\cdots\right) \\
&amp;=R_{t+1}+\gamma G_{t+1}
\end{aligned}
\]</span></p>
<ul>
<li>Example: <span class="math inline">\(\gamma = 0.5,T=5\)</span>, with reward sequence <span class="math inline">\(R_1=-1,R_2=2,R_3=6,R_4=3,R_5=2.\)</span> Find <span class="math inline">\(G_0...G_5\)</span></li>
</ul>
<p>Solution:</p>
<p><span class="math display">\[
\begin{aligned}
G_5&amp;=R_6=0 \\
G_4&amp;=R_5=2 \\
G_3&amp;=R_4+\gamma R_5=4 \\
G_2&amp;=R_3+\gamma R_4+\gamma^2 R_5=8 \\
G_1&amp;=R_2+\gamma R_3+\gamma^2 R_4+\gamma^3 R_5=6 \\
G_0&amp;=R_1+\gamma R_2+\gamma^2 R_3+\gamma^3 R_4+\gamma^4 R_5=2 \\
\end{aligned}
\]</span></p>
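<p>The recursive relation <span class="math inline">\(G_{t}=R_{t+1}+\gamma G_{t+1}\)</span> also suggests how to compute returns in practice: walk backwards from the end of the episode. A minimal sketch reproducing the example above:</p>
<pre class="python"><code># Compute G_0..G_T backwards using G_t = R_{t+1} + gamma * G_{t+1}.
# The rewards reproduce the example above; G_5 = 0 by convention.
gamma = 0.5
rewards = [-1, 2, 6, 3, 2]        # R_1, R_2, R_3, R_4, R_5
T = len(rewards)

returns = [0.0] * (T + 1)         # returns[t] holds G_t, with G_T = 0
for t in range(T - 1, -1, -1):
    returns[t] = rewards[t] + gamma * returns[t + 1]

print(returns)                    # [2.0, 6.0, 8.0, 4.0, 2.0, 0.0]
</code></pre>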
<h1 id="策略和价值函数">Policies and Value Functions</h1>
<h2 id="基本概念-1">Basic Concepts</h2>
<p><font face="仿宋"><strong>Value functions</strong></font>: functions of states (or of state-action pairs) that estimate how good it is for the agent to be in a given state (or to take a given action in a given state), measured as the expected return.</p>
<p><font face="仿宋"><strong>Policies</strong></font>: mappings from states to the probabilities of selecting each possible action.</p>
<p><span class="math inline">\(\pi(a|s)\)</span>: the probability that the agent selects action <span class="math inline">\(A_t\)</span> when it is in state <span class="math inline">\(S_t\)</span> at time <span class="math inline">\(t\)</span>.</p>
<ul>
<li>The "<span class="math inline">\(|\)</span>" here is just notation indicating that, in each state, the action <span class="math inline">\(A_t\)</span> follows some probability distribution.</li>
<li>Reinforcement learning methods specify how the agent's policy changes as a result of its experience.</li>
</ul>
<h2 id="价值函数">Value Functions</h2>
<p><font face="仿宋"><strong>Value function</strong></font>: the <strong>expected</strong> return obtained when starting from state <span class="math inline">\(s\)</span> and following policy <span class="math inline">\(\pi\)</span> thereafter, written <span class="math inline">\(v_{\pi}(s)\)</span>. For an MDP, <span class="math inline">\(v_{\pi}(s)\)</span> is defined as:</p>
<p><span class="math display">\[
\begin{equation}
v_{\pi}(s) \doteq \mathbb{E}_{\pi}\left[G_{t} \mid S_{t}=s\right]=\mathbb{E}_{\pi}\left[\sum_{k=0}^{\infty} \gamma^{k} R_{t+k+1} \mid S_{t}=s\right], \text { for all } s \in \mathcal{S} \label{def v_{\pi}(s)}
\end{equation}
\]</span></p>
<p><span class="math inline">\(\mathbb{E}_{\pi}\)</span> denotes the expected value of a random variable given that the agent follows policy <span class="math inline">\(\pi\)</span>. <span class="math inline">\(G_t\)</span> is the return; the formula simply substitutes the return defined earlier for continuing tasks. <span class="math inline">\(v_{\pi}\)</span> is called the <font face="仿宋" color=red><strong>state-value function for policy</strong></font> <span class="math inline">\(\color{red}\pi\)</span>.</p>
<p>Similarly, we define the expected return when starting from state <span class="math inline">\(s\)</span>, taking action <span class="math inline">\(a\)</span>, and thereafter following policy <span class="math inline">\(\pi\)</span>, written <span class="math inline">\(q_{\pi}(s,a)\)</span>:</p>
<p><span class="math display">\[
\begin{equation}
q_{\pi}(s, a) \doteq \mathbb{E}_{\pi}\left[G_{t} \mid S_{t}=s, A_{t}=a\right]=\mathbb{E}_{\pi}\left[\sum_{k=0}^{\infty} \gamma^{k} R_{t+k+1} \mid S_{t}=s, A_{t}=a\right] \label{def p_{\pi}(s,a)}
\end{equation}
\]</span></p>
<p>This is called the <font face="仿宋" color=red><strong>action-value function for policy</strong></font> <span class="math inline">\(\color{red}\pi\)</span>.</p>
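<p>Since <span class="math inline">\(v_{\pi}(s)\)</span> and <span class="math inline">\(q_{\pi}(s,a)\)</span> are defined as expectations of the return, they can in principle be estimated by averaging sampled returns. The sketch below is purely illustrative and not an algorithm from this chapter; the tiny MDP, the policy, and all numbers are made up. It estimates <span class="math inline">\(v_{\pi}(s)\)</span> by averaging truncated discounted returns over many sampled trajectories; <span class="math inline">\(q_{\pi}(s,a)\)</span> could be estimated the same way by fixing the first action.</p>
<pre class="python"><code># Illustrative only: estimate v_pi(s0) as the average discounted return
# over sampled trajectories.  The MDP and policy are made-up values.
import random

gamma = 0.9
# p[(s, a)] is a list of (probability, next_state, reward) outcomes.
p = {
    ("s0", "stay"): [(1.0, "s0", 0.0)],
    ("s0", "go"):   [(0.8, "s1", 1.0), (0.2, "s0", 0.0)],
    ("s1", "stay"): [(1.0, "s1", 2.0)],
    ("s1", "go"):   [(1.0, "s0", 0.0)],
}
pi = {"s0": {"stay": 0.5, "go": 0.5}, "s1": {"stay": 0.9, "go": 0.1}}

def sample_return(s, horizon=100):
    """One sampled discounted return starting from s, following pi."""
    g, discount = 0.0, 1.0
    for _step in range(horizon):              # truncate the infinite sum
        actions = list(pi[s])
        a = random.choices(actions, weights=[pi[s][x] for x in actions])[0]
        outcomes = p[(s, a)]
        _prob, s, r = random.choices(outcomes, weights=[o[0] for o in outcomes])[0]
        g += discount * r                     # add gamma^k * R_{t+k+1}
        discount *= gamma
    return g

print(sum(sample_return("s0") for _ in range(2000)) / 2000)  # estimate of v_pi(s0)
</code></pre>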
<h2 id="贝尔曼方程">The Bellman Equation</h2>
<h3 id="状态价值函数">State-Value Function</h3>
<p>Throughout reinforcement learning and dynamic programming, value functions can be written recursively. Let us rewrite the state-value function in recursive form:</p>
<p><span class="math display">\[
\begin{aligned}
v_{\pi}(s) &amp; \doteq \mathbb{E}_{\pi}\left[G_{t} \mid S_{t}=s\right] \\
&amp;=\mathbb{E}_{\pi}\left[{\color{brown}R_{t+1}+\gamma G_{t+1} } \mid S_{t}=s\right] \\
&amp;=\mathbb{E}_{\pi}\left[{\color{brown}R_{t+1}}\mid S_{t}=s\right]+{\color{brown}\gamma} \mathbb{E}_{\pi}\left[{\color{brown}G_{t+1}} \mid S_{t}=s\right]\\
&amp;=\sum_{a} \pi(a \mid s)\sum_{r \in \mathcal{R}} r \sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime}, r \mid s, a\right)+\gamma \sum_{a} \pi(a \mid s){\color{brown}\sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime} \mid s, a\right)}\mathbb{E}_{\pi}\left[G_{t+1} \mid {\color{brown}S_{t+1}=s&#39;} \right]\\
&amp;=\sum_{a} \pi(a \mid s)\sum_{r \in \mathcal{R}}  \sum_{s^{\prime} \in \mathcal{S}} r \cdot p\left(s^{\prime}, r \mid s, a\right)+ \gamma\sum_{a} \pi(a \mid s){\sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime} \mid s, a\right)} v_{\pi}(s&#39;) , \quad \text { for all } s \in \mathcal{S}
\end{aligned}
\]</span></p>
<p>Notes on the derivation:</p>
<ul>
<li><p>$G_{t} = R_{t+1}+\gamma G_{t+1}$</p></li>
<li><p><span class="math inline">\(\mathbb{E}_{\pi}\left[{\color{brown}R_{t+1}+\gamma G_{t+1} } \mid S_{t}=s\right]= \mathbb{E}_{\pi}\left[{\color{brown}R_{t+1}}\mid S_{t}=s\right]+{\color{brown}\gamma} \mathbb{E}_{\pi}\left[{\color{brown}G_{t+1}} \mid S_{t}=s\right]\)</span></p></li>
<li><p>Expected reward formula: <span class="math inline">\(r(s, a)=\sum_{r \in \mathcal{R}} r \sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime}, r \mid s, a\right)\)</span>. Because the action-selection probability <span class="math inline">\(\pi(a|s)\)</span> enters here, we have <span class="math inline">\(\mathbb{E}_{\pi}\left[R_{t+1}\mid S_{t}=s\right]=\sum_{a} \pi(a \mid s)\cdot r(s, a)=\sum_{a} \pi(a \mid s)\cdot \sum_{r \in \mathcal{R}} r \sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime}, r \mid s, a\right)\)</span></p></li>
<li><p>Because the condition <span class="math inline">\(S_{t}=s\)</span> is replaced by <span class="math inline">\(S_{t+1} =s&#39;\)</span>, we must sum over all possible <span class="math inline">\(s&#39;\)</span> (weighted by <span class="math inline">\(\pi(a|s)\)</span> and <span class="math inline">\(p(s&#39;|s,a)\)</span>) so that the sample space stays the same</p></li>
</ul>
<hr />
<p>In the result above, <span class="math inline">\(\sum_{r \in \mathcal{R}} \sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime}, r \mid s, a\right)={\sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime} \mid s, a\right)}\)</span>, so the two terms can be merged into a single sum over <span class="math inline">\(s&#39;\)</span> and <span class="math inline">\(r\)</span>, giving a more compact form:</p>
<p><span class="math display">\[
\begin{equation}\label{Bellman equation}
\color{red}v_{\pi}(s) =\sum_{a} \pi(a \mid s) \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma v_{\pi}\left(s^{\prime}\right)\right]
\end{equation}
\]</span></p>
<p>This equation is called <font color=red>the Bellman equation for <span class="math inline">\(v_{\pi}\)</span></font>. The last two factors in the expression can be read as an expected value: the equation is a sum over the three variables <span class="math inline">\(a,s&#39;,r\)</span>. For each triple we compute the probability <span class="math inline">\(\pi(a \mid s) \cdot p\left(s^{\prime}, r \mid s, a\right)\)</span>, weight the quantity in brackets by it, and sum over all triples to obtain the expected value.</p>
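<p>This "sum over triples" reading translates directly into code. Below is a minimal sketch (the dynamics table, policy, and <span class="math inline">\(\gamma\)</span> are made-up values) that computes the right-hand side literally and, by applying it repeatedly as a fixed-point update, converges to <span class="math inline">\(v_{\pi}\)</span>:</p>
<pre class="python"><code># Minimal sketch: the Bellman right-hand side as a literal sum over the
# triples (a, s', r).  Dynamics, policy and gamma are made-up values.
gamma = 0.9
# p[(s, a)] maps (s_next, r) to a probability.
p = {
    ("s0", "a0"): {("s0", 0.0): 0.5, ("s1", 1.0): 0.5},
    ("s0", "a1"): {("s1", 2.0): 1.0},
    ("s1", "a0"): {("s0", 0.0): 1.0},
    ("s1", "a1"): {("s1", -1.0): 1.0},
}
pi = {"s0": {"a0": 0.4, "a1": 0.6}, "s1": {"a0": 1.0, "a1": 0.0}}

def bellman_rhs(s, v):
    """sum_a pi(a|s) sum_{s',r} p(s',r|s,a) [r + gamma * v(s')]."""
    total = 0.0
    for a, pi_a in pi[s].items():
        for (s_next, r), prob in p[(s, a)].items():
            total += pi_a * prob * (r + gamma * v[s_next])
    return total

# Repeatedly replacing v(s) by the right-hand side converges to v_pi.
v = {"s0": 0.0, "s1": 0.0}
for _ in range(200):
    v = {s: bellman_rhs(s, v) for s in v}
print(v)   # roughly {'s0': 8.14, 's1': 7.33}
</code></pre>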
<h3 id="远见与短视">Farsighted vs. Myopic</h3>
<p>If we set <span class="math inline">\(\gamma = 0\)</span>, i.e. ignore the value of the successor state <span class="math inline">\(v_{\pi}(s&#39;)\)</span>, the Bellman equation reduces to the expected reward <span class="math inline">\(\eqref{expected rewards}\)</span>:</p>
<p><span class="math display">\[
v_{\pi}(s) =\sum_{a} \pi(a \mid s) \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right) \cdot r = \sum_{a} \pi(a \mid s) \cdot r(s,a)
\]</span></p>
<p>In this case the state-value function only estimates the reward of the very next action, which amounts to a greedy algorithm. When <span class="math inline">\(\gamma \neq 0\)</span>, i.e. when the value of the successor state is included, the state-value function also accounts for value further ahead, making the agent farsighted.</p>
<p>It also helps to consider where <span class="math inline">\(v_{\pi}(s&#39;)\)</span> comes from. The derivation shows it originates from <span class="math inline">\(\mathbb{E}(G_{t+1})\)</span>, and <span class="math inline">\(G_{t+1}=R_{t+2}+\gamma R_{t+3}+...\)</span> is the (discounted) sum of future rewards. In other words, including <span class="math inline">\(v_{\pi}(s&#39;)\)</span> makes the evaluation look beyond the next reward and take the future into account.</p>
<hr />
<p><strong>Example</strong>: <span class="math inline">\(\gamma = 0.9\)</span>; the states above, below, left of, and right of a grid cell have values 2.3, 0.4, -0.4, and 0.7; all rewards are zero; the four directions are chosen with equal probability. Find the value of the center state.</p>
<p>Solution: the actions are chosen with equal probability, so <span class="math inline">\(\pi(a|s)=\frac{1}{4}\)</span> for every action; no state-transition probabilities are specified, so we take them all to be 1, i.e. <span class="math inline">\(p(s&#39;,r|s,a)=1\)</span>; and every reward is <span class="math inline">\(r=0\)</span>.</p>
<p><span class="math display">\[
v_{\pi}(s)=\frac{1}{4} \times 1 \times (0+0.9 \times 2.3)+
\frac{1}{4} \times 1 \times (0+0.9 \times 0.4)+
\frac{1}{4} \times 1 \times (0+0.9 \times (-0.4))+
\frac{1}{4} \times 1 \times (0+0.9 \times 0.7)=0.675
\]</span></p>
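<p>A quick numerical check of this example (the neighbor values are the ones assumed from the figure):</p>
<pre class="python"><code># Quick check of the gridworld example above.
gamma = 0.9
neighbor_values = [2.3, 0.4, -0.4, 0.7]    # the four neighboring states
v_center = sum(0.25 * 1.0 * (0.0 + gamma * v) for v in neighbor_values)
print(round(v_center, 3))                  # 0.675
</code></pre>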
<h3 id="动作价值函数">Action-Value Function</h3>
<p>In the same way we can derive the Bellman equation for the <strong>action-value function</strong>:</p>
<p><span class="math display">\[
\begin{aligned}
q_{\pi}(s,a) &amp; \doteq \mathbb{E}_{\pi}\left[G_{t} \mid S_{t}=s,A_t=a\right] \\
&amp;=\mathbb{E}_{\pi}\left[{\color{brown}R_{t+1}+\gamma G_{t+1} } \mid S_{t}=s,A_t=a\right] \\
&amp;=\mathbb{E}_{\pi}\left[{\color{brown}R_{t+1}}\mid S_{t}=s,A_t=a\right]+{\color{brown}\gamma} \mathbb{E}_{\pi}\left[{\color{brown}G_{t+1}} \mid S_{t}=s,A_t=a\right]\\
&amp;=\sum_{r \in \mathcal{R}} r \sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime}, r \mid s, a\right)+\gamma{\color{brown}{\sum_{s^{\prime} \in \mathcal{S}} p\left(s^{\prime} \mid s, a\right)}\sum_{a&#39;}\pi(a&#39;|s&#39;)} \mathbb{E}_{\pi}\left[G_{t+1} \mid {\color{brown}S_{t+1}=s&#39;},{\color{brown}A_{t+1}=a&#39;} \right]\\
&amp;=\sum_{r \in \mathcal{R}}  \sum_{s^{\prime} \in \mathcal{S}} r \cdot p\left(s^{\prime}, r \mid s, a\right)+ \gamma {\sum_{s^{\prime} \in \mathcal{S}}\sum_{a&#39;} p\left(s^{\prime} \mid s, a\right)} \cdot \pi(a&#39;|s&#39;) q_{\pi}(s&#39;,a&#39;) \\
&amp;=\sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma \sum_{a^{\prime}} \pi\left(a^{\prime} \mid s^{\prime}\right) q_{\pi}\left(s^{\prime}, a^{\prime}\right)\right]
, \quad \text { for all } s \in \mathcal{S}
\end{aligned}
\]</span></p>
<p>Tip: because the condition <span class="math inline">\(A_{t}=a\)</span> is replaced by <span class="math inline">\(A_{t+1} =a&#39;\)</span>, we must sum over all possible <span class="math inline">\(a&#39;\)</span> to preserve the sample space.</p>
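<p>The same kind of fixed-point sketch works for the action-value Bellman equation; again every table and number is a made-up illustrative value (the same ones used in the earlier <span class="math inline">\(v_{\pi}\)</span> sketch):</p>
<pre class="python"><code># Minimal sketch of the q_pi Bellman equation as a fixed-point update.
gamma = 0.9
p = {
    ("s0", "a0"): {("s0", 0.0): 0.5, ("s1", 1.0): 0.5},
    ("s0", "a1"): {("s1", 2.0): 1.0},
    ("s1", "a0"): {("s0", 0.0): 1.0},
    ("s1", "a1"): {("s1", -1.0): 1.0},
}
pi = {"s0": {"a0": 0.4, "a1": 0.6}, "s1": {"a0": 1.0, "a1": 0.0}}

def q_rhs(s, a, q):
    """sum_{s',r} p(s',r|s,a) [r + gamma * sum_{a'} pi(a'|s') q(s',a')]."""
    total = 0.0
    for (s_next, r), prob in p[(s, a)].items():
        future = sum(p_a * q[(s_next, a_next)]
                     for a_next, p_a in pi[s_next].items())
        total += prob * (r + gamma * future)
    return total

q = {sa: 0.0 for sa in p}
for _ in range(200):
    q = {(s, a): q_rhs(s, a, q) for (s, a) in q}
print(q)
# Sanity check: sum_a pi(a|s0) q(s0,a) matches v_pi(s0) from the previous sketch.
print(sum(pi["s0"][a] * q[("s0", a)] for a in pi["s0"]))   # about 8.14
</code></pre>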
<h2 id="回溯图backupdiagram">Backup Diagrams</h2>
<p>Open circles represent states; solid circles represent state-action pairs.</p>
<p><img src="image-20211024220454457.png" alt="image-20211024220454457" style="zoom: 25%;" /></p>
<p>From state <span class="math inline">\(S\)</span> the probabilities of reaching each of the six successor states differ, and so do the rewards.</p>
<h3 id="从回溯图理解贝尔曼公式">Reading the Bellman Equation from the Backup Diagram</h3>
<p>To understand the summations in the Bellman equation, first be clear that <span class="math inline">\(v_{\pi}(s)\)</span> is a weighted average over all future choices and the rewards they bring. With the backup diagram above in mind: <span class="math inline">\(\sum_{a}\pi(a|s)\)</span> sums over all possible action choices, and <span class="math inline">\(\sum_{s&#39;,r}p(s&#39;,r|s,a)\)</span> sums over all possible state transitions once the action has been chosen.</p>
<h3 id="状态价值与动作价值函数关系">Relation Between State Values and Action Values</h3>
<ol type="1">
<li>From the definitions via the return</li>
</ol>
<p>Compare the definitions of the state-value and action-value functions, <span class="math inline">\(\eqref{def v_{\pi}(s)}\)</span> and <span class="math inline">\(\eqref{def p_{\pi}(s,a)}\)</span>:</p>
<p><span class="math display">\[
\begin{aligned}
v_{\pi}(s) &amp;\doteq \mathbb{E}_{\pi}\left[G_{t} \mid S_{t}=s\right]\\
q_{\pi}(s, a) &amp;\doteq \mathbb{E}_{\pi}\left[G_{t} \mid S_{t}=s, A_{t}=a\right]
\end{aligned}
\]</span></p>
<p>Compared with the state-value function, the action-value function fixes a particular action and evaluates how good that action is; the state-value function evaluates how good the current state itself is.</p>
<ol start="2" type="1">
<li>From the Bellman equations</li>
</ol>
<p>The state value is determined by the <strong>action values</strong> and the <strong>probabilities of selecting each action</strong> under the current policy, so the two functions satisfy:</p>
<p><span class="math display">\[
\begin{equation}
v_{\pi}(s) = \sum_{a \in A_t} \pi(a|s) \cdot q_{\pi}(s,a) \label{relation between v and p 1}
\end{equation}
\]</span></p>
<p>As a backup diagram:</p>
<p><img src="image-20211026102701417.png" alt="image-20211026102701417" style="zoom: 33%;" /></p>
<p>Likewise, the <strong>action value</strong> consists of the reward for the action plus the value of the successor state, as shown below:</p>
<p><img src="image-20211026152532511.png" alt="image-20211026152532511" style="zoom: 25%;" /></p>
<p>This gives the relation:</p>
<p><span class="math display">\[
\begin{equation}
q_{\pi}(s, a) =\sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma v_{\pi}\left(s^{\prime}\right)\right] \label{relation between v and p 2}
\end{equation}
\]</span></p>
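<p>These two relations can be written as two small helper functions. A minimal sketch with made-up values (the dictionary format is the same one used in the earlier sketches):</p>
<pre class="python"><code># v(s) = sum_a pi(a|s) q(s,a)   and   q(s,a) = sum_{s',r} p(s',r|s,a)[r + gamma v(s')]
def v_from_q(s, q, pi):
    return sum(pi_a * q[(s, a)] for a, pi_a in pi[s].items())

def q_from_v(s, a, v, p, gamma):
    return sum(prob * (r + gamma * v[s_next])
               for (s_next, r), prob in p[(s, a)].items())

# Tiny made-up example.
gamma = 0.9
pi = {"s0": {"a0": 0.4, "a1": 0.6}}
p = {("s0", "a0"): {("s0", 0.0): 1.0}, ("s0", "a1"): {("s0", 2.0): 1.0}}
v = {"s0": 1.0}
q = {("s0", a): q_from_v("s0", a, v, p, gamma) for a in pi["s0"]}
print(q)                              # {('s0', 'a0'): 0.9, ('s0', 'a1'): 2.9}
print(round(v_from_q("s0", q, pi), 3))  # 0.4*0.9 + 0.6*2.9 = 2.1
</code></pre>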
<h1 id="最优策略与最优价值函数">Optimal Policies and Optimal Value Functions</h1>
<p>The reinforcement learning task is to find a policy <span class="math inline">\(\pi\)</span> that obtains the greatest possible reward over the long run.</p>
<p>There is always at least one policy that is no worse than every other policy; it is called an <font face="仿宋"><strong>optimal policy</strong></font> and written <span class="math inline">\(\pi_*\)</span>. Under an optimal policy the reward obtained is maximal, and the corresponding <font face="仿宋"><strong>optimal state-value function</strong></font> is the largest of all <strong>state-value functions</strong>:</p>
<p><span class="math display">\[
v_{*}(s) \doteq \max _{\pi} v_{\pi}(s)
\]</span></p>
<p>Correspondingly, the optimal action-value function is:</p>
<p><span class="math display">\[
\begin{equation}
q_{*}(s, a) \doteq \max _{\pi} q_{\pi}(s, a) \label{q_{*}(s, a)=max}
\end{equation}
\]</span></p>
<p>We noted earlier that</p>
<blockquote>
<p>the state value is determined by the <strong>action values</strong> and the <strong>probabilities of selecting each action</strong> under the current policy.</p>
</blockquote>
<p>To make the state value as large as possible, we must choose the action with the highest action value and choose it with probability 1, so:</p>
<p><span class="math display">\[
\begin{equation}
v_*(s) = \max_{a\in A} q_{\color{red} \pi_*}(s,a) \label{v_*(s)=max q}
\end{equation}
\]</span></p>
<p>Combining this with the relation between action values and state values <span class="math inline">\(\eqref{relation between v and p 2}\)</span> gives:</p>
<p><span class="math display">\[
\color{red}v_*(s) =\max_a \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma v_{*}\left(s^{\prime}\right)\right]
\]</span></p>
<p>This is called the <font face="仿宋" color=red><strong>Bellman optimality equation</strong></font>. The Bellman optimality equation for <span class="math inline">\(q_*\)</span> can be derived as follows:</p>
<p><span class="math display">\[
\begin{aligned}
q_*(s,a) 
&amp;= \max_{\pi} q_{\pi}(s,a)\\
&amp;= \max_{\pi} \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma \sum_{a^{\prime}} \pi\left(a^{\prime} \mid s^{\prime}\right) q_{\pi}\left(s^{\prime}, a^{\prime}\right)\right] \\
&amp;= \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma \max_{\pi} \sum_{a^{\prime}} \pi\left(a^{\prime} \mid s^{\prime}\right) q_{\pi}\left(s^{\prime}, a^{\prime}\right)\right] \\
&amp;= \color{red}\sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma \max_{a^{\prime}}  q_{*}\left(s^{\prime}, a^{\prime}\right)\right]
\end{aligned}
\]</span></p>
<p>Comparing equations <span class="math inline">\(\eqref{v_*(s)=max q}\)</span> and <span class="math inline">\(\eqref{q_{*}(s, a)=max}\)</span> shows that <span class="math inline">\(v_*(s)=\max_a q_*(s,a)\)</span>. This is easy to interpret: the state value is determined by the action values and the action-selection probabilities, and since the optimal policy always picks a best action, that action gets probability <span class="math inline">\(\pi=1\)</span> while every other action gets <span class="math inline">\(\pi=0\)</span>.</p>
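<p>A minimal sketch of the Bellman optimality equation treated as a fixed-point update, again using the made-up dynamics table from the earlier sketches (solving it this way previews the dynamic-programming methods of later chapters):</p>
<pre class="python"><code># Minimal sketch: iterate v(s) = max_a sum_{s',r} p(s',r|s,a)[r + gamma v(s')]
# as a fixed-point update, then read off a greedy policy.
gamma = 0.9
p = {
    ("s0", "a0"): {("s0", 0.0): 0.5, ("s1", 1.0): 0.5},
    ("s0", "a1"): {("s1", 2.0): 1.0},
    ("s1", "a0"): {("s0", 0.0): 1.0},
    ("s1", "a1"): {("s1", -1.0): 1.0},
}
states = ["s0", "s1"]
actions = ["a0", "a1"]

def backup(s, a, v):
    return sum(prob * (r + gamma * v[s_next])
               for (s_next, r), prob in p[(s, a)].items())

v = {s: 0.0 for s in states}
for _ in range(200):
    v = {s: max(backup(s, a, v) for a in actions) for s in states}

greedy = {s: max(actions, key=lambda a: backup(s, a, v)) for s in states}
print(v)       # roughly {'s0': 10.53, 's1': 9.47}
print(greedy)  # {'s0': 'a1', 's1': 'a0'}
</code></pre>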
<h1 id="实例">Worked Example</h1>
<ol type="1">
<li>Example 1: consider the continuing MDP shown below. In the top state the only choice is between the actions left and right. There are two policies <span class="math inline">\(\pi_{left},\pi_{right}\)</span>. Which policy is optimal when <span class="math inline">\(\gamma = 0, 0.9, 0.5\)</span>?</li>
</ol>
</ol>
<p><img src="2021-10-27_19-53-37.png" alt="2021-10-27_19-53-37" style="zoom: 40%;" /></p>
<p>Solution: we need to find <span class="math inline">\(v_*(s)= \max \{v_{\pi_{left}}(s), v_{\pi_{right}}(s)\}\)</span></p>
<p><span class="math display">\[
\begin{aligned}
v_{\pi_{left}}(s) &amp;= \pi(a_1|s) \cdot p(s_1,r_1|s,a_1) \left[ r_1 + \gamma v_{\pi_{left}}(s_1) \right] \\
v_{\pi_{right}}(s) &amp;= \pi(a_2|s) \cdot p(s_2,r_2|s,a_2) \left[ r_2 + \gamma v_{\pi_{right}}(s_2) \right] \\
\\
v_{\pi_{left}}(s_1) &amp;= \pi(a_{11}|s_1) \cdot p(s,r_{11}|s_1,a_{11}) \left[ r_{11} + \gamma v_{\pi_{left}}(s) \right] \\
v_{\pi_{right}}(s_2) &amp;= \pi(a_{22}|s_2) \cdot p(s,r_{22}|s_2,a_{22}) \left[ r_{22} + \gamma v_{\pi_{right}}(s) \right] \\
\end{aligned}
\]</span></p>
<p>Substituting the numbers (each of the two policies chooses its action deterministically, so every <span class="math inline">\(\pi\)</span> factor is 1, and the transitions are deterministic, so every <span class="math inline">\(p\)</span> is 1):</p>
<p><span class="math display">\[
\begin{aligned}
v_{\pi_{left}}(s) &amp;= 1 \cdot 1 \left[ 1 + \gamma v_{\pi_{left}}(s_1) \right] \\
v_{\pi_{right}}(s) &amp;= 1 \cdot 1 \left[ 0 + \gamma v_{\pi_{right}}(s_2) \right] \\
\\
v_{\pi_{left}}(s_1) &amp;= 1 \cdot 1 \left[ 0 + \gamma v_{\pi_{left}}(s) \right] \\
v_{\pi_{right}}(s_2) &amp;= 1 \cdot 1 \left[ 2 + \gamma v_{\pi_{right}}(s) \right] \\
\end{aligned}
\]</span></p>
<p>Substituting each pair of equations into one another:</p>
<p><span class="math display">\[
\begin{cases}
v_{\pi_{left}}(s) &amp;= \frac{1}{1-\gamma^2}\\
v_{\pi_{right}}(s) &amp;= \frac{2\gamma}{1-\gamma^2}\\
\end{cases}
\]</span></p>
<p>So for <span class="math inline">\(\gamma = 0\)</span> left is better, for <span class="math inline">\(\gamma = 0.5\)</span> the two are equal, and for <span class="math inline">\(\gamma = 0.9\)</span> right is better. As <span class="math inline">\(\gamma\)</span> increases, the agent becomes more and more farsighted.</p>
<p>Writing out the Bellman equations for all the values explicitly and solving them, as above, is called the <strong>explicit solution method</strong>. It is akin to exhaustive search, and in practice it can solve very few problems, because it requires all three of the following:</p>
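<p>A quick numerical check of the two closed-form values at the three discount rates:</p>
<pre class="python"><code># Check v_left(s) = 1/(1 - gamma^2) and v_right(s) = 2*gamma/(1 - gamma^2).
for gamma in (0.0, 0.5, 0.9):
    v_left = 1 / (1 - gamma ** 2)
    v_right = 2 * gamma / (1 - gamma ** 2)
    print(gamma, round(v_left, 3), round(v_right, 3))
# gamma = 0.0: left 1.0,   right 0.0    (pi_left is better)
# gamma = 0.5: left 1.333, right 1.333  (the two are equal)
# gamma = 0.9: left 5.263, right 9.474  (pi_right is better)
</code></pre>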
<ul>
<li>Exact knowledge of the environment's dynamics</li>
<li>Enough computational resources to carry out the solution</li>
<li>The Markov property</li>
</ul>
<h1 id="本章小结">Chapter Summary</h1>
<p>This chapter introduced a great many concepts. The first thing to be clear about is the pair of <strong>fundamental probabilities</strong>: the state-transition probability <span class="math inline">\(p(s&#39;|s,a)\)</span> and the action-selection probability <span class="math inline">\(\pi(a|s)\)</span>:</p>
<p><img src="2021-10-26-20-53-18.jpg" alt="2021-10-26-20-53-18" style="zoom:50%;" /></p>
<p>The picture above shows where these two probabilities come into play. Selecting an action happens with some probability; after the action is selected, transitioning to a particular state also happens with some probability. All we can do is choose an action; the transition probabilities are determined entirely by the environment and are generally beyond our control.</p>
<p>Second, understand the meaning of the Bellman equation and the relation between state values and action values.</p>
<p>Third, understand the meaning of the optimal value functions.</p>
<p>There are also some remaining questions. Looking at the Bellman equation <span class="math inline">\(\eqref{Bellman equation}\)</span>, it contains the future term <span class="math inline">\(v_{\pi}(s&#39;)\)</span>. This term is what gives the agent its "farsightedness", but it also raises a problem: <span class="math inline">\(v_{\pi}(s&#39;)\)</span> refers to the future, so how can we obtain it in an actual computation without "seeing ahead"?</p>
<p>Second, once we have the values, how do we use them to improve our policy?</p>
<p>These two problems are evaluation and improvement, and the following chapters revolve around them.</p>

    </div>

    
    
    

      <footer class="post-footer">
          <div class="post-tags">
              <a href="/tags/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/" rel="tag"># 强化学习</a>
              <a href="/tags/MDP/" rel="tag"># MDP</a>
              <a href="/tags/%E8%B4%9D%E5%B0%94%E6%9B%BC%E5%85%AC%E5%BC%8F/" rel="tag"># 贝尔曼公式</a>
              <a href="/tags/%E5%9B%9E%E6%BA%AF%E5%9B%BE/" rel="tag"># 回溯图</a>
          </div>

        


        
    <div class="post-nav">
      <div class="post-nav-item">
    <a href="/2021/10/19/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/2-%E5%A4%9A%E8%87%82%E6%9C%BA/" rel="prev" title="2-多臂机">
      <i class="fa fa-chevron-left"></i> 2-多臂机
    </a></div>
      <div class="post-nav-item">
    <a href="/2021/10/30/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/4-%E5%8A%A8%E6%80%81%E8%A7%84%E5%88%92/" rel="next" title="4-动态规划">
      4-动态规划 <i class="fa fa-chevron-right"></i>
    </a></div>
    </div>
      </footer>
    
  </article>
  
  
  



          </div>
          

<script>
  window.addEventListener('tabs:register', () => {
    let { activeClass } = CONFIG.comments;
    if (CONFIG.comments.storage) {
      activeClass = localStorage.getItem('comments_active') || activeClass;
    }
    if (activeClass) {
      let activeTab = document.querySelector(`a[href="#comment-${activeClass}"]`);
      if (activeTab) {
        activeTab.click();
      }
    }
  });
  if (CONFIG.comments.storage) {
    window.addEventListener('tabs:click', event => {
      if (!event.target.matches('.tabs-comment .tab-content .tab-pane')) return;
      let commentClass = event.target.classList[1];
      localStorage.setItem('comments_active', commentClass);
    });
  }
</script>

        </div>
          
  
  <div class="toggle sidebar-toggle">
    <span class="toggle-line toggle-line-first"></span>
    <span class="toggle-line toggle-line-middle"></span>
    <span class="toggle-line toggle-line-last"></span>
  </div>

  <aside class="sidebar">
    <div class="sidebar-inner">

      <ul class="sidebar-nav motion-element">
        <li class="sidebar-nav-toc">
          文章目录
        </li>
        <li class="sidebar-nav-overview">
          站点概览
        </li>
      </ul>

      <!--noindex-->
      <div class="post-toc-wrap sidebar-panel">
          <div class="post-toc motion-element"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#%E5%9F%BA%E6%9C%AC%E6%A6%82%E5%BF%B5"><span class="nav-number">1.</span> <span class="nav-text">基本概念</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E5%9B%9E%E6%8A%A5%E4%B8%8E%E5%88%86%E5%B9%95"><span class="nav-number">2.</span> <span class="nav-text">回报与分幕</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E7%AD%96%E7%95%A5%E5%92%8C%E4%BB%B7%E5%80%BC%E5%87%BD%E6%95%B0"><span class="nav-number">3.</span> <span class="nav-text">策略和价值函数 ​</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%9F%BA%E6%9C%AC%E6%A6%82%E5%BF%B5-1"><span class="nav-number">3.1.</span> <span class="nav-text">基本概念</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E4%BB%B7%E5%80%BC%E5%87%BD%E6%95%B0"><span class="nav-number">3.2.</span> <span class="nav-text">价值函数</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E8%B4%9D%E5%B0%94%E6%9B%BC%E6%96%B9%E7%A8%8B"><span class="nav-number">3.3.</span> <span class="nav-text">贝尔曼方程</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#%E7%8A%B6%E6%80%81%E4%BB%B7%E5%80%BC%E5%87%BD%E6%95%B0"><span class="nav-number">3.3.1.</span> <span class="nav-text">状态价值函数</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E8%BF%9C%E8%A7%81%E4%B8%8E%E7%9F%AD%E8%A7%86"><span class="nav-number">3.3.2.</span> <span class="nav-text">“远见”与“短视”</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E5%8A%A8%E4%BD%9C%E4%BB%B7%E5%80%BC%E5%87%BD%E6%95%B0"><span class="nav-number">3.3.3.</span> <span class="nav-text">动作价值函数</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%9B%9E%E6%BA%AF%E5%9B%BEbackupdiagram"><span class="nav-number">3.4.</span> <span class="nav-text">回溯图\(backup~diagram\)</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#%E4%BB%8E%E5%9B%9E%E6%BA%AF%E5%9B%BE%E7%90%86%E8%A7%A3%E8%B4%9D%E5%B0%94%E6%9B%BC%E5%85%AC%E5%BC%8F"><span class="nav-number">3.4.1.</span> <span class="nav-text">从回溯图理解贝尔曼公式</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#%E7%8A%B6%E6%80%81%E4%BB%B7%E5%80%BC%E4%B8%8E%E5%8A%A8%E4%BD%9C%E4%BB%B7%E5%80%BC%E5%87%BD%E6%95%B0%E5%85%B3%E7%B3%BB"><span class="nav-number">3.4.2.</span> <span class="nav-text">状态价值与动作价值函数关系</span></a></li></ol></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E6%9C%80%E4%BC%98%E7%AD%96%E7%95%A5%E4%B8%8E%E6%9C%80%E4%BC%98%E4%BB%B7%E5%80%BC%E5%87%BD%E6%95%B0"><span class="nav-number">4.</span> <span class="nav-text">最优策略与最优价值函数</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E5%AE%9E%E4%BE%8B"><span class="nav-number">5.</span> <span class="nav-text">实例</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E6%9C%AC%E7%AB%A0%E5%B0%8F%E7%BB%93"><span class="nav-number">6.</span> <span class="nav-text">本章小结</span></a></li></ol></div>
      </div>
      <!--/noindex-->

      <div class="site-overview-wrap sidebar-panel">
        <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
    <img class="site-author-image" itemprop="image" alt="谢祖浩"
      src="/images/head.jpeg">
  <p class="site-author-name" itemprop="name">谢祖浩</p>
  <div class="site-description" itemprop="description">驽马十驾，功在不舍</div>
</div>
<div class="site-state-wrap motion-element">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/archives/">
        
          <span class="site-state-item-count">9</span>
          <span class="site-state-item-name">日志</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
            <a href="/categories/">
          
        <span class="site-state-item-count">2</span>
        <span class="site-state-item-name">分类</span></a>
      </div>
      <div class="site-state-item site-state-tags">
            <a href="/tags/">
          
        <span class="site-state-item-count">13</span>
        <span class="site-state-item-name">标签</span></a>
      </div>
  </nav>
</div>



      </div>

    </div>
  </aside>
  <div id="sidebar-dimmer"></div>


      </div>
    </main>

    <footer class="footer">
      <div class="footer-inner">
        

        

<div class="copyright">
  
  &copy; 
  <span itemprop="copyrightYear">2022</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">谢祖浩</span>
</div>
  <div class="powered-by">由 <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> & <a href="https://pisces.theme-next.org/" class="theme-link" rel="noopener" target="_blank">NexT.Pisces</a> 强力驱动
  </div>

        








      </div>
    </footer>
  </div>

  
  <script src="/lib/anime.min.js"></script>
  <script src="/lib/velocity/velocity.min.js"></script>
  <script src="/lib/velocity/velocity.ui.min.js"></script>

<script src="/js/utils.js"></script>

<script src="/js/motion.js"></script>


<script src="/js/schemes/pisces.js"></script>


<script src="/js/next-boot.js"></script>




  















  

  
      

<script>
  if (typeof MathJax === 'undefined') {
    window.MathJax = {
      loader: {
        source: {
          '[tex]/amsCd': '[tex]/amscd',
          '[tex]/AMScd': '[tex]/amscd'
        }
      },
      tex: {
        inlineMath: {'[+]': [['$', '$']]},
        tags: 'ams'
      },
      options: {
        renderActions: {
          findScript: [10, doc => {
            document.querySelectorAll('script[type^="math/tex"]').forEach(node => {
              const display = !!node.type.match(/; *mode=display/);
              const math = new doc.options.MathItem(node.textContent, doc.inputJax[0], display);
              const text = document.createTextNode('');
              node.parentNode.replaceChild(text, node);
              math.start = {node: text, delim: '', n: 0};
              math.end = {node: text, delim: '', n: 0};
              doc.math.push(math);
            });
          }, '', false],
          insertedScript: [200, () => {
            document.querySelectorAll('mjx-container').forEach(node => {
              let target = node.parentNode;
              if (target.nodeName.toLowerCase() === 'li') {
                target.parentNode.classList.add('has-jax');
              }
            });
          }, '', false]
        }
      }
    };
    (function () {
      var script = document.createElement('script');
      script.src = '//cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js';
      script.defer = true;
      document.head.appendChild(script);
    })();
  } else {
    MathJax.startup.document.state(0);
    MathJax.texReset();
    MathJax.typeset();
  }
</script>

    

  

</body>
</html>
